For profiling I used TAU (https://tau.uoregon.edu/), since I wanted to try it out. TAU can report how much time is spent in each function, and I used this to report only the time taken by each call to count_sort (or qsort).
For statistical significance each timing should be repeated at least 10 times, with the mean and standard deviation taken into consideration. I had part of the analysis prepared for this, but not all simulations finished in time, so I reverted to the single-run case.
Python modules used in analysis:
In [1]:
%matplotlib inline
from IPython.core.display import HTML
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from analysis_scripts import prepare_panel
In the i-loop case $i, j, \text{count}$ can be private.
$a$ and $\text{temp}$ should be shared: $a$ is read-only but can be big (so making it private would add substantial overhead), and $\text{temp}$ is written by all threads, but each iteration writes to a distinct position (so there are no race conditions).
$n$ is a constant in this context, but it needs to be shared; otherwise each thread would get its own uninitialized copy and produce errors.
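As a reference for the discussion above, here is a minimal sketch of how these clauses could be laid out (the actual code is the one rendered from ../src/omp_iloop.c below; the count_sort body in this sketch assumes the usual comparison-counting formulation over an int array):

#include <stdlib.h>
#include <string.h>
#include <omp.h>

void count_sort(int a[], int n) {
    int i, j, count;
    int *temp = malloc(n * sizeof(int));

    /* i, j and count are per-iteration scratch -> private;
       a, temp and n are shared by all threads. */
    #pragma omp parallel for private(i, j, count) shared(a, temp, n)
    for (i = 0; i < n; i++) {
        count = 0;
        for (j = 0; j < n; j++)           /* count elements that sort before a[i] */
            if (a[j] < a[i] || (a[j] == a[i] && j < i))
                count++;
        temp[count] = a[i];               /* each i writes a distinct position: no race */
    }

    memcpy(a, temp, n * sizeof(int));
    free(temp);
}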
In [30]:
!pygmentize -l c -f html -O full -o ./iloop.html ../src/omp_iloop.c
HTML(open('iloop.html', 'r').read())
Out[30]:
In the j-loop case $j$ should be private. $\text{count}$ needs to be shared, since every thread may update it. This also means a thread has to wait if another thread is already updating $\text{count}$.
Since $a$ is read-only but can be big (making it private would add substantial overhead), I made it shared.
$i$ and $n$ are constants in this context, but they need to be shared; otherwise each thread would get its own uninitialized copy and produce errors.
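A minimal sketch of the corresponding loop nest (same count_sort body as in the i-loop sketch above, but with the pragma moved to the inner loop; the atomic update is an assumption here, a critical section would behave the same way):

for (i = 0; i < n; i++) {
    int count = 0;                        /* declared before the parallel region -> shared inside it */
    #pragma omp parallel for private(j) shared(a, count, i, n)
    for (j = 0; j < n; j++) {
        if (a[j] < a[i] || (a[j] == a[i] && j < i)) {
            #pragma omp atomic            /* every thread updates count, so it must be synchronized */
            count++;
        }
    }
    temp[count] = a[i];
}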
In [3]:
!pygmentize -l c -f html -O full -o ./jloop.html ../src/omp_jloop.c
HTML(open('jloop.html', 'r').read())
Out[3]:
I decided to use $n = 10, 100, 1000, 10000, 100000$, and $1, 2, 4, 8, 10, 16, 20$ threads.
For the Xeon Phi I used $1, 2, 4, 8, 16, 32, 64, 128, 240$ threads.
In [4]:
VECTOR_SIZE = [10, 100, 1000, 10000, 100000]
THREADS = [1, 2, 4, 8, 10, 16, 20]
PHI_THREADS = [1, 2, 4, 8, 16, 32, 64, 128, 240]
This is my implementation using qsort.
To keep it consistent with the i-loop and j-loop OpenMP implementations (and so share the analysis code), I wrapped the qsort call inside another function, $\text{qsort\_lib}$.
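A minimal sketch of what this wrapper could look like (the real file is rendered below; the int element type and the comparison logic are assumptions):

#include <stdlib.h>

int cmpfunc(const void *p, const void *q) {
    int x = *(const int *)p, y = *(const int *)q;
    return (x > y) - (x < y);             /* ascending order, overflow-safe */
}

void qsort_lib(int a[], int n) {
    /* a single wrapper function so TAU attributes the whole sort to one
       profile entry, mirroring count_sort in the OpenMP versions */
    qsort(a, n, sizeof(int), cmpfunc);
}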
In [5]:
!pygmentize -l c -f html -O full -o ./qsort.html ../src/qsort.c
HTML(open('qsort.html', 'r').read())
Out[5]:
These commands build the executable, submit it to run on the HPCC job system, and finally extract the raw TAU info into a text file. They are all implemented in the Makefile.
In [6]:
!cd .. && make exec/qsort-GNU
!cd .. && make workdir/qsort.pbs
!cd .. && make TAU_PROFILES_QSORT
In [7]:
data = prepare_panel('qsort', threads=[1])
pn_qsort = pd.Panel4D(data)
# dimensions: pn[vector size, threads, function name, metric]
pn_qsort.ix[:,:,'qsort_lib','Inclusive usec/call']
Out[7]:
In [9]:
pn_qsort.ix[:,:,'qsort_lib','Inclusive usec/call'].T.plot(marker='o', logy=True, logx=True)
plt.xlabel('Number of threads')
plt.ylabel('Microseconds')
Out[9]:
In [12]:
!cd .. && make exec/omp_iloop-GNU
!cd .. && make workdir/omp_iloop.pbs
!cd .. && make TAU_PROFILES_ILOOP
In [13]:
data = prepare_panel('omp_iloop')
pn_iloop = pd.Panel4D(data)
# dimensions: pn[vector size, threads, function name, metric]
pn_iloop.ix[:,:,'count_sort','Inclusive usec/call']
Out[13]:
In [14]:
pn_iloop.ix[:,:,'count_sort','Inclusive usec/call'].plot(marker='o', logx=True, logy=True)
plt.xlabel('Number of threads')
plt.ylabel('Microseconds')
Out[14]:
In [15]:
speedup_iloop = pn_iloop.ix[:,1,'count_sort','Inclusive usec/call'] / pn_iloop.ix[:,:,'count_sort','Inclusive usec/call']
speedup_iloop
Out[15]:
In [16]:
speedup_iloop.plot()
plt.plot(THREADS, THREADS, '--', label='Linear speedup')
plt.legend(loc='best')
plt.xlabel('Number of threads')
plt.ylabel('Speedup')
plt.title("$i$-loop parallel for")
Out[16]:
As expected, for small vector sizes synchronization overhead and the small workload make the function slower than the sequential case.
For bigger vector sizes there is a significant speedup, especially in the $n=100000$ case.
For $t=20, v=10000$ there was probably competition for resources with other processes running on the node (which is why more repetitions and mean/standard deviation statistics are needed).
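For reference, the speedup computed above is the inclusive time of the single-thread run divided by the inclusive time of the $t$-thread run for the same vector size,

$$S(n, t) = \frac{T_1(n)}{T_t(n)},$$

so the dashed $S = t$ line in the plot marks perfect linear speedup.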
In [17]:
speedup_iloop_qsort = pn_qsort.ix[:,1,'qsort_lib','Inclusive usec/call'] / pn_iloop.ix[:,:,'count_sort','Inclusive usec/call']
speedup_iloop_qsort
Out[17]:
In [18]:
speedup_iloop_qsort.plot()
plt.legend(loc='best')
plt.xlabel('Number of threads')
plt.ylabel('Speedup')
plt.title("$i$-loop parallel for")
Out[18]:
Using qsort as the base case we can see how a better algorithm matters more in the long run. Since qsort is $O(n \log n)$ in the average case, the parallelized $O(n^2)$ algorithm is still slower.
Despite that, for small vector sizes count_sort was actually faster in the sequential case.
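As a rough order-of-magnitude check (ignoring constant factors) for the largest vector size, $n = 100000$:

$$n^2 = 10^{10} \qquad \text{vs.} \qquad n \log_2 n \approx 10^5 \times 17 \approx 1.7 \times 10^6,$$

a gap of more than three orders of magnitude, far more than the at most $20\times$ that the available threads could recover.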
In [24]:
!cd .. && make exec/omp_jloop-GNU
!cd .. && make workdir/omp_jloop.pbs
!cd .. && make TAU_PROFILES_JLOOP
In [26]:
data = prepare_panel('omp_jloop')
pn_jloop = pd.Panel4D(data)
# dimensions: pn[vector size, threads, function name, metric]
pn_jloop.ix[:,:,'count_sort','Inclusive usec/call']
Out[26]:
In [27]:
pn_jloop.ix[:,:,'count_sort','Inclusive usec/call'].plot(marker='o', logx=True, logy=True)
plt.xlabel('Number of threads')
plt.ylabel('Microseconds')
Out[27]:
In [28]:
speedup_jloop = pn_jloop.ix[:,1,'count_sort','Inclusive usec/call'] / pn_jloop.ix[:,:,'count_sort','Inclusive usec/call']
speedup_jloop
Out[28]:
In [29]:
speedup_jloop.plot()
plt.legend(loc='best')
plt.xlabel('Number of threads')
plt.ylabel('Speedup')
plt.title("$j$-loop parallel for")
Out[29]:
For the j-loop case adding more threads just made the function slower, and this is expected because $\text{count}$ needs to be synchronized and other threads need to wait to update it.
In [37]:
speedup_jloop_qsort = pn_qsort.ix[:,1,'qsort_lib','Inclusive usec/call'] / pn_jloop.ix[:,:,'count_sort','Inclusive usec/call']
speedup_jloop_qsort
Out[37]:
In [39]:
speedup_jloop_qsort.plot()
plt.legend(loc='best')
plt.xlabel('Number of threads')
plt.ylabel('Speedup')
plt.title("$i$-loop parallel for")
Out[39]:
The same conclusion holds when comparing against the qsort base case: since $\text{count}$ has to be synchronized and threads wait on each other to update it, adding more threads only makes the j-loop version slower.
For the Xeon Phi implementation of qsort I had to declare cmpfunc as 'runnable' on the MIC architecture. The only other modification is the #pragma offload around the qsort call.
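A minimal sketch of those two changes, assuming the Intel Language Extensions for Offload syntax and an int array (the actual file is rendered below):

#include <stdlib.h>

/* cmpfunc must also be compiled for the coprocessor, i.e. be 'runnable' on MIC */
__attribute__((target(mic)))
int cmpfunc(const void *p, const void *q) {
    int x = *(const int *)p, y = *(const int *)q;
    return (x > y) - (x < y);
}

void qsort_lib(int a[], int n) {
    /* ship a to the card, sort there, copy the result back */
    #pragma offload target(mic) inout(a : length(n))
    qsort(a, n, sizeof(int), cmpfunc);
}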
In [59]:
!pygmentize -l c -f html -O full -o ./mic_qsort.html ../src/mic_qsort.c
HTML(open('mic_qsort.html', 'r').read())
Out[59]:
For the i-loop Xeon Phi implementation I added the #pragma offload before the OpenMP parallel for.
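A minimal sketch of that combination (the data-transfer clauses are assumptions; the actual file is rendered below):

/* offload the whole parallel loop; a goes in, temp comes back out */
#pragma offload target(mic) in(a : length(n)) out(temp : length(n))
#pragma omp parallel for private(i, j, count) shared(a, temp, n)
for (i = 0; i < n; i++) {
    count = 0;
    for (j = 0; j < n; j++)
        if (a[j] < a[i] || (a[j] == a[i] && j < i))
            count++;
    temp[count] = a[i];
}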
In [60]:
!pygmentize -l c -f html -O full -o ./mic_iloop.html ../src/mic_iloop.c
HTML(open('mic_iloop.html', 'r').read())
Out[60]:
Since I used TAU for the other analyses, I wanted to use TAU for the Xeon Phi cases too. But HPCC didn't have it installed at first, and after it was installed the profile data wasn't generated. I contacted HPCC support but didn't get an answer in time for the homework.
It should be compute-bound, but it ends up being memory-bound because of the poor memory access pattern: for each element of the array, the previous elements need to be accessed again, and more efficient sorting algorithms (like merge sort or even quicksort) have better access patterns than count sort.
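As a back-of-the-envelope illustration of the access pattern (assuming the usual formulation in which each element is compared against roughly the whole array):

$$\text{re-reads per element} \approx n = 10^5 \qquad \text{vs.} \qquad \log_2 n \approx 17 \text{ for merge sort or quicksort},$$

so the repeated traversals dominate even though the arithmetic done per comparison is trivial.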